{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ed4f77ae",
   "metadata": {},
   "source": [
    "# How to train and load a LDA model using `pke`\n",
    "\n",
    "A Latent Dirichlet Allocation (LDA) model is required by the TopicalPageRank model for weighting candidates. Below is an example on how to train such model for a given text collection."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "316a4ac4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting git+https://github.com/boudinfl/pke.git\n",
      "  Cloning https://github.com/boudinfl/pke.git to /private/var/folders/_s/dsym612j14gggkqchsd35clh0000gn/T/pip-req-build-7gjb6fw2\n",
      "  Running command git clone --filter=blob:none --quiet https://github.com/boudinfl/pke.git /private/var/folders/_s/dsym612j14gggkqchsd35clh0000gn/T/pip-req-build-7gjb6fw2\n",
      "  Resolved https://github.com/boudinfl/pke.git to commit dec50293ffd25f63a554e5822af13608686bfc77\n",
      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25hRequirement already satisfied: nltk in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (3.7)\n",
      "Requirement already satisfied: networkx in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (2.7.1)\n",
      "Requirement already satisfied: numpy in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.22.3)\n",
      "Requirement already satisfied: scipy in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.8.0)\n",
      "Requirement already satisfied: sklearn in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (0.0)\n",
      "Requirement already satisfied: unidecode in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.3.4)\n",
      "Requirement already satisfied: future in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (0.18.2)\n",
      "Requirement already satisfied: joblib in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (1.1.0)\n",
      "Requirement already satisfied: spacy>=3.2.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pke==2.0.0) (3.2.3)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.7)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.3.0)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.6)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.4.2)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.9)\n",
      "Requirement already satisfied: setuptools in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (60.9.3)\n",
      "Requirement already satisfied: pathy>=0.3.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.6.1)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (4.63.1)\n",
      "Requirement already satisfied: jinja2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (3.0.3)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.1)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (1.8.2)\n",
      "Requirement already satisfied: packaging>=20.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (21.3)\n",
      "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (8.0.15)\n",
      "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.9.0)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.27.1)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (2.0.6)\n",
      "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.4.0)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (1.0.6)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy>=3.2.3->pke==2.0.0) (0.7.7)\n",
      "Requirement already satisfied: click in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from nltk->pke==2.0.0) (8.0.4)\n",
      "Requirement already satisfied: regex>=2021.8.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from nltk->pke==2.0.0) (2022.3.15)\n",
      "Requirement already satisfied: scikit-learn in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from sklearn->pke==2.0.0) (1.0.2)\n",
      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from packaging>=20.0->spacy>=3.2.3->pke==2.0.0) (3.0.7)\n",
      "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pathy>=0.3.5->spacy>=3.2.3->pke==2.0.0) (5.2.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy>=3.2.3->pke==2.0.0) (4.1.1)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (3.3)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2.0.12)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (2021.10.8)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy>=3.2.3->pke==2.0.0) (1.26.9)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from jinja2->spacy>=3.2.3->pke==2.0.0) (2.1.1)\n",
      "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from scikit-learn->sklearn->pke==2.0.0) (3.1.0)\n",
      "Building wheels for collected packages: pke\n",
      "  Building wheel for pke (setup.py) ... \u001b[?25ldone\n",
      "\u001b[?25h  Created wheel for pke: filename=pke-2.0.0-py3-none-any.whl size=6160309 sha256=0fb41ea38a461b3902a6ee84b063e03f9191e99d2fd3c5c4870d2a8b90ae6838\n",
      "  Stored in directory: /private/var/folders/_s/dsym612j14gggkqchsd35clh0000gn/T/pip-ephem-wheel-cache-vbc3pz88/wheels/8c/07/29/6b35bed2aa36e33d77ff3677eb716965ece4d2e56639ad0aab\n",
      "Successfully built pke\n",
      "Installing collected packages: pke\n",
      "Successfully installed pke-2.0.0\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.1.1 is available.\n",
      "You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/pke/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mRequirement already satisfied: datasets in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (2.2.2)\n",
      "Requirement already satisfied: numpy>=1.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (1.22.3)\n",
      "Requirement already satisfied: huggingface-hub<1.0.0,>=0.1.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.6.0)\n",
      "Requirement already satisfied: pyarrow>=6.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (8.0.0)\n",
      "Requirement already satisfied: xxhash in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (3.0.0)\n",
      "Requirement already satisfied: responses<0.19 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.18.0)\n",
      "Requirement already satisfied: packaging in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (21.3)\n",
      "Requirement already satisfied: requests>=2.19.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (2.27.1)\n",
      "Requirement already satisfied: dill<0.3.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.3.4)\n",
      "Requirement already satisfied: tqdm>=4.62.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (4.63.1)\n",
      "Requirement already satisfied: pandas in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (1.4.2)\n",
      "Requirement already satisfied: aiohttp in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (3.8.1)\n",
      "Requirement already satisfied: multiprocess in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (0.70.12.2)\n",
      "Requirement already satisfied: fsspec[http]>=2021.05.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from datasets) (2022.5.0)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: filelock in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (3.7.0)\n",
      "Requirement already satisfied: pyyaml in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (6.0)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from huggingface-hub<1.0.0,>=0.1.0->datasets) (4.1.1)\n",
      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from packaging->datasets) (3.0.7)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (3.3)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2.0.12)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (1.26.9)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests>=2.19.0->datasets) (2021.10.8)\n",
      "Requirement already satisfied: frozenlist>=1.1.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.0)\n",
      "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (6.0.2)\n",
      "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (4.0.2)\n",
      "Requirement already satisfied: attrs>=17.3.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (21.4.0)\n",
      "Requirement already satisfied: aiosignal>=1.1.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.2.0)\n",
      "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from aiohttp->datasets) (1.7.2)\n",
      "Requirement already satisfied: pytz>=2020.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pandas->datasets) (2022.1)\n",
      "Requirement already satisfied: python-dateutil>=2.8.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n",
      "Requirement already satisfied: six>=1.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from python-dateutil>=2.8.1->pandas->datasets) (1.16.0)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.1.1 is available.\n",
      "You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/pke/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0mCollecting en-core-web-sm==3.2.0\n",
      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.9/13.9 MB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
      "\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from en-core-web-sm==3.2.0) (3.2.3)\n",
      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
      "Requirement already satisfied: numpy>=1.15.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.22.3)\n",
      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.7)\n",
      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)\n",
      "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.15)\n",
      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)\n",
      "Requirement already satisfied: pathy>=0.3.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)\n",
      "Requirement already satisfied: jinja2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.3)\n",
      "Requirement already satisfied: setuptools in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (60.9.3)\n",
      "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)\n",
      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)\n",
      "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.9.0)\n",
      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)\n",
      "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)\n",
      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)\n",
      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.27.1)\n",
      "Requirement already satisfied: packaging>=20.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (21.3)\n",
      "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.7)\n",
      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.63.1)\n",
      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.9)\n",
      "Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.7)\n",
      "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)\n",
      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.1.1)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2021.10.8)\n",
      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.9)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3)\n",
      "Requirement already satisfied: charset-normalizer~=2.0.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.12)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.4)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /Users/boudin-f/Documents/GitHub/pke/venv/lib/python3.10/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.1.1)\n",
      "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 22.1.1 is available.\n",
      "You should consider upgrading via the '/Users/boudin-f/Documents/GitHub/pke/venv/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n",
      "\u001b[0m\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
      "You can now load the package via spacy.load('en_core_web_sm')\n"
     ]
    }
   ],
   "source": [
    "!pip install git+https://github.com/boudinfl/pke.git\n",
    "!pip install datasets\n",
    "!python -m spacy download en_core_web_sm"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "06de9943",
   "metadata": {},
   "source": [
    "## Preamble on keyphrase extraction datasets using 🤗 datasets\n",
    "\n",
    "For simplicity and ease of use, we rely on the `datasets` module from 🤗 huggingface to load and access sample documents from keyphrase extraction datasets. Please have a look at our organization page (https://huggingface.co/taln-ls2n) for more information on which datasets are available."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "07358b3c",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "No config specified, defaulting to: inspec/raw\n",
      "Reusing dataset inspec (/Users/boudin-f/.cache/huggingface/datasets/taln-ls2n___inspec/raw/1.1.0/0ae146cabe770846946b3279b4c751efe0aca2dd68b3f24427d4624cd22bb20d)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fbf2d3d261a340268bddecd23b1f9bf9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from datasets import load_dataset\n",
    "\n",
    "# load the inspec dataset\n",
    "dataset = load_dataset('taln-ls2n/inspec')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "34f21333",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "3a51c7cee42d446caef1d91f38ddc8ca",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/1000 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import spacy\n",
    "from tqdm.notebook import tqdm\n",
    "from spacy.tokenizer import _get_regex_pattern\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "\n",
    "# Tokenization fix for in-word hyphens (e.g. 'non-linear' would be kept \n",
    "# as one token instead of default spacy behavior of 'non', '-', 'linear')\n",
    "# https://spacy.io/usage/linguistic-features#native-tokenizer-additions\n",
    "\n",
    "from spacy.lang.char_classes import ALPHA, ALPHA_LOWER, ALPHA_UPPER\n",
    "from spacy.lang.char_classes import CONCAT_QUOTES, LIST_ELLIPSES, LIST_ICONS\n",
    "from spacy.util import compile_infix_regex\n",
    "\n",
    "# Modify tokenizer infix patterns\n",
    "infixes = (\n",
    "    LIST_ELLIPSES\n",
    "    + LIST_ICONS\n",
    "    + [\n",
    "        r\"(?<=[0-9])[+\\-\\*^](?=[0-9-])\",\n",
    "        r\"(?<=[{al}{q}])\\.(?=[{au}{q}])\".format(\n",
    "            al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES\n",
    "        ),\n",
    "        r\"(?<=[{a}]),(?=[{a}])\".format(a=ALPHA),\n",
    "        # ✅ Commented out regex that splits on hyphens between letters:\n",
    "        # r\"(?<=[{a}])(?:{h})(?=[{a}])\".format(a=ALPHA, h=HYPHENS),\n",
    "        r\"(?<=[{a}0-9])[:<>=/](?=[{a}])\".format(a=ALPHA),\n",
    "    ]\n",
    ")\n",
    "\n",
    "infix_re = compile_infix_regex(infixes)\n",
    "nlp.tokenizer.infix_finditer = infix_re.finditer\n",
    "\n",
    "# populates a docs list with spacy doc objects\n",
    "docs = []\n",
    "for sample in tqdm(dataset['train']):\n",
    "    docs.append(nlp(sample[\"title\"]+\". \"+sample[\"abstract\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "e2800131",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.9/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n",
      "  warnings.warn(msg, category=FutureWarning)\n",
      "INFO:root:writing LDA model to inspec.lda.pickle.gz\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "from pke import compute_lda_model\n",
    "from string import punctuation\n",
    "\n",
    "logging.basicConfig(level=logging.INFO)\n",
    "\n",
    "compute_lda_model(\n",
    "    documents=docs,\n",
    "    output_file=\"inspec.lda.pickle.gz\",\n",
    "    n_topics=500,               # number of topics\n",
    "    language='en',              # language of the input files\n",
    "    stoplist=list(punctuation), # stoplist (punctuation marks)\n",
    "    normalization='stemming'    # use porter stemmer\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "3fbe0621",
   "metadata": {},
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "cannot import name 'load_lda_model' from 'pke' (/usr/local/lib/python3.9/site-packages/pke/__init__.py)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Input \u001b[0;32mIn [5]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpke\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m load_lda_model\n\u001b[1;32m      3\u001b[0m lda_model \u001b[38;5;241m=\u001b[39m load_lda_model(input_file\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minspec.lda.pickle.gz\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
      "\u001b[0;31mImportError\u001b[0m: cannot import name 'load_lda_model' from 'pke' (/usr/local/lib/python3.9/site-packages/pke/__init__.py)"
     ]
    }
   ],
   "source": [
    "from pke import load_lda_model\n",
    "\n",
    "lda_model = load_lda_model(input_file=\"inspec.lda.pickle.gz\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
